Importo librerias¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

Funciones¶

Buenas prácticas

Voy guardando las funciones que están automatizadas y pienso que me van a servir en otros proyectos en un funciones_auxiliares.py y lo importo:

In [2]:
def plot_feature(df, col_name, isContinuous, target):
    """
    Visualize a variable with and without faceting on the loan status.
    - df dataframe
    - col_name is the variable name in the dataframe
    - full_name is the full variable name
    - continuous is True if the variable is continuous, False otherwise
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    count_null = df[col_name].isnull().sum()
    if isContinuous:

        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df, x=col_name, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
    plt.xticks(rotation = 90)


    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by '+target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        data.columns = [i, target, 'proportion']
        #sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target+' fraction')
        ax2.set_title(target)
        plt.xticks(rotation = 90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()


def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
        -Descripción: Función que recibe un dataset y devuelve una lista con los nombres de las 
        variables categóricas
        -Inputs: 
            -- dataset: Pandas dataframe que contiene los datos
        -Return:
            -- lista_variables_categoricas: lista con los nombres de las variables categóricas del
            dataset de entrada con menos de 100 valores diferentes
            -- 1: la ejecución es incorrecta
    '''
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1
    lista_variables_categoricas = []
    other = []
    for i in dataset.columns:
        if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                lista_variables_categoricas.append(i)
            else:
                other.append(i)

    return lista_variables_categoricas, other


def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    # Para obtener la correlación de Spearman, sólo cambiar el metodo por 'spearman'

    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    # Compute the correlation matrix
    corr = dataset.corr(method=metodo)
    # Set self-correlation to zero to avoid distraction
    for i in range(corr.shape[0]):
        corr.iloc[i, i] = 0
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=size_figure)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5,  cmap ='viridis' ) #cbar_kws={"shrink": .5}
    plt.show()

    return 0

def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    """
    Devuelve el porcentaje de valores que exceden del intervalo de confianza
    :type series:
    :param multiplier:
    :return:
    """
    pd_final = pd.DataFrame()

    for i in list_var_continuous:

        series_mean = pd_loan[i].mean()
        series_std = pd_loan[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = pd_loan[i].size

        perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
        perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s

        if perc_excess>0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)] \
                                             .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
                                         pd_concat_percent.iloc[0,1]]
            pd_concat_percent = pd_concat_percent.drop(target,axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)

    if pd_final.empty:
        print('No existen variables con valores nulos')

    return pd_final


def get_percent_null_values_target(pd_loan, list_var_continuous, target):

    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        if pd_loan[i].isnull().sum()>0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()] \
                                             .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
                                         pd_concat_percent.iloc[0,1]]
            pd_concat_percent = pd_concat_percent.drop(target,axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
            pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)

    if pd_final.empty:
        print('No existen variables con valores nulos')

    return pd_final



def cramers_v(confusion_matrix):
    """ 
    calculate Cramers V statistic for categorial-categorial association.
    uses correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    
    confusion_matrix: tabla creada con pd.crosstab()
    
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

Lectura de datos del preprocesado inicial¶

Leo el dataset

In [3]:
df_fraud = pd.read_csv("../data/pd_data_initial_preprocessing.csv").drop('Unnamed: 0',axis=1)
df_fraud.shape
Out[3]:
(1000000, 32)
In [4]:
df_fraud.columns
Out[4]:
Index(['intended_balcon_amount', 'prev_address_months_count',
       'bank_months_count', 'current_address_months_count',
       'session_length_in_minutes', 'fraud_bool', 'foreign_request',
       'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit',
       'device_os', 'source', 'housing_status', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'phone_home_valid',
       'credit_risk_score', 'email_is_free', 'income', 'employment_status',
       'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w',
       'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w',
       'payment_type', 'days_since_request', 'customer_age',
       'name_email_similarity', 'month'],
      dtype='object')
In [5]:
list_var_cat, other = dame_variables_categoricas(dataset=df_fraud)
df_fraud[list_var_cat] = df_fraud[list_var_cat].astype("category")
list_var_continuous = list(df_fraud.select_dtypes('float').columns)
df_fraud[list_var_continuous] = df_fraud[list_var_continuous].astype(float)
df_fraud.dtypes
Out[5]:
intended_balcon_amount               float64
prev_address_months_count            float64
bank_months_count                    float64
current_address_months_count         float64
session_length_in_minutes            float64
fraud_bool                          category
foreign_request                      float64
phone_mobile_valid                   float64
has_other_cards                      float64
proposed_credit_limit                float64
device_os                           category
source                              category
housing_status                      category
keep_alive_session                   float64
device_distinct_emails_8w            float64
device_fraud_count                   float64
phone_home_valid                     float64
credit_risk_score                    float64
email_is_free                        float64
income                               float64
employment_status                   category
date_of_birth_distinct_emails_4w     float64
bank_branch_count_8w                 float64
velocity_4w                          float64
velocity_24h                         float64
velocity_6h                          float64
zip_count_4w                         float64
payment_type                        category
days_since_request                   float64
customer_age                         float64
name_email_similarity                float64
month                                float64
dtype: object

Este código prepara el conjunto de datos de nuestro dataframe para el análisis exploratorio, asegurándonos de que las variables categóricas y las variables continuas estén en el formato correcto.

Separación en train y test estratificado¶

In [6]:
df_fraud_bool = df_fraud['fraud_bool'] \
    .value_counts(normalize=True) \
    .mul(100).rename('percent').reset_index()

df_fraud_bool_conteo = df_fraud['fraud_bool'].value_counts().reset_index()
df_fraud_bool_pc = pd.merge(df_fraud_bool, df_fraud_bool_conteo, on=['fraud_bool'], how='inner')
df_fraud_bool_pc
fig = px.histogram(df_fraud_bool_pc, x="fraud_bool", y=['percent'])
fig.update_xaxes(tickvals = [0, 1])
fig.show()

En este código creamos dos dataframes, el primero contiene el porcentaje de los valores de 'fraud_bool', y el segundo contiene el conteo absoluto de cada valor. Luego se fusionan esos dataframes en uno solo utilizando la columna de valores únicos 'index'. Por último, realizamos un gráfico que nos muestra las probabilidades de fraude, siendo 0 ausencia de fraude y 1 fraude.

In [7]:
from sklearn.model_selection import train_test_split
X_df_fraud, X_df_fraud_test, y_df_fraud, y_df_fraud_test = train_test_split(df_fraud.drop('fraud_bool',axis=1),
                                                                        df_fraud['fraud_bool'],
                                                                        stratify=df_fraud['fraud_bool'],
                                                                        test_size=0.2)
df_fraud_train = pd.concat([X_df_fraud, y_df_fraud],axis=1)
df_fraud_test = pd.concat([X_df_fraud_test, y_df_fraud_test],axis=1)

Dividimos nuestro conjunto de datos en train y test, con una proporción del 80% y 20% respectivamente

In [8]:
print('== Train\n', df_fraud_train['fraud_bool'].value_counts(normalize=True))
print('== Test\n', df_fraud_test['fraud_bool'].value_counts(normalize=True))
== Train
 fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64
== Test
 fraud_bool
0    0.98897
1    0.01103
Name: proportion, dtype: float64

Visualización descriptiva de los datos¶

In [9]:
pd_series_null_columns = df_fraud_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = df_fraud_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)

pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])
pd_null_filas['target'] = df_fraud['fraud_bool'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/df_fraud_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/df_fraud_train.shape[1]
(32,) (800000,)

Vemos el número de valores nulos por filas y por columnas

In [10]:
pd_null_columnas
Out[10]:
nulos_columnas porcentaje_columnas
intended_balcon_amount 593804 0.742255
prev_address_months_count 570153 0.712691
bank_months_count 202916 0.253645
current_address_months_count 3442 0.004302
session_length_in_minutes 1598 0.001998
velocity_6h 0 0.000000
date_of_birth_distinct_emails_4w 0 0.000000
bank_branch_count_8w 0 0.000000
velocity_4w 0 0.000000
velocity_24h 0 0.000000
payment_type 0 0.000000
zip_count_4w 0 0.000000
income 0 0.000000
days_since_request 0 0.000000
customer_age 0 0.000000
name_email_similarity 0 0.000000
month 0 0.000000
employment_status 0 0.000000
credit_risk_score 0 0.000000
email_is_free 0 0.000000
phone_home_valid 0 0.000000
device_fraud_count 0 0.000000
device_distinct_emails_8w 0 0.000000
keep_alive_session 0 0.000000
housing_status 0 0.000000
source 0 0.000000
device_os 0 0.000000
proposed_credit_limit 0 0.000000
has_other_cards 0 0.000000
phone_mobile_valid 0 0.000000
foreign_request 0 0.000000
fraud_bool 0 0.000000
In [11]:
pd_null_filas.head()
Out[11]:
nulos_filas target porcentaje_filas
483460 4 0 0.125
525137 4 0 0.125
448512 4 0 0.125
779819 4 0 0.125
69027 4 0 0.125

Distribución del resto de variables

In [12]:
# warnings.filterwarnings('ignore')
for i in list(df_fraud_train.columns):
    if (df_fraud_train[i].dtype==float) & (i!='fraud_bool'):
        plot_feature(df_fraud_train, col_name=i, isContinuous=True, target='fraud_bool')
    elif  i!='fraud_bool':
        plot_feature(df_fraud_train, col_name=i, isContinuous=False, target='fraud_bool')
C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:9: RuntimeWarning:

More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning:

is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead

C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Se generan visualizaciones para las características del conjunto de datos, considerando si son de tipo float o categóricas, en relación con la variable objetivo 'fraud_bool'.

Por ejemplo, hay un gráfico de barras que compara la proporción de fraudes ('fraud_bool') en función del sistema operativo del dispositivo ('device_os'). Las barras representan el porcentaje de fraudes para cada sistema operativo, calculado sobre el total de observaciones en ese sistema. Se observa que las barras correspondientes al valor 0 son significativamente más altas en todos los sistemas operativos,indicando que la mayoría de las observaciones no son fraudes. Sin embargo, entre las barras del valor 1,las más altas se encuentran en los sistemas operativos 'Windows' y 'Macintosh', sugiriendo una posible asociaciónentre estos sistemas y un mayor porcentaje de fraudes.

Otro ejemplo es el gráfico de caja (boxplot) que compara la distribución del ingreso ('income') en función de la la proporción de fraudes ('fraud_bool'). Para las observaciones sin fraude, el boxplot muestra que la mayoría de los ingresos se encuentran entre 0.3 y 0.8, con una mediana en 0.6. Por otro lado, para las observaciones con fraude el rango de ingresos es más pequeño, y la mayoría de los valores se encuentran entre 0.6 y 0.9, situándose la median en 0.8. Se observa un outlier en 0.1. Este boxplot proporciona información sobre la variabilidad en los ingresos para las categorías de fraude y no fraude.

Tratamiento de las variables continuas¶

A continuación, se tratan los valores missing, las correlaciones de las variables continuas y los outliers

In [13]:
list_var_continuous
Out[13]:
['intended_balcon_amount',
 'prev_address_months_count',
 'bank_months_count',
 'current_address_months_count',
 'session_length_in_minutes',
 'foreign_request',
 'phone_mobile_valid',
 'has_other_cards',
 'proposed_credit_limit',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'phone_home_valid',
 'credit_risk_score',
 'email_is_free',
 'income',
 'date_of_birth_distinct_emails_4w',
 'bank_branch_count_8w',
 'velocity_4w',
 'velocity_24h',
 'velocity_6h',
 'zip_count_4w',
 'days_since_request',
 'customer_age',
 'name_email_similarity',
 'month']

Tratamiento de outliers¶

In [14]:
get_deviation_of_mean_perc(df_fraud_train, list_var_continuous, target='fraud_bool', multiplier=3)
Out[14]:
0.0 1.0 variable sum_outlier_values porcentaje_sum_null_values
0 0.989886 0.010114 intended_balcon_amount 1582 0.001978
1 0.993456 0.006544 prev_address_months_count 6877 0.008596
2 0.983702 0.016298 current_address_months_count 17119 0.021399
3 0.978821 0.021179 session_length_in_minutes 18839 0.023549
4 0.978306 0.021694 foreign_request 20328 0.025410
5 0.869619 0.130381 proposed_credit_limit 4878 0.006097
6 0.961725 0.038275 device_distinct_emails_8w 25500 0.031875
7 0.964863 0.035137 credit_risk_score 2846 0.003557
8 0.993676 0.006324 date_of_birth_distinct_emails_4w 5060 0.006325
9 0.989364 0.010636 bank_branch_count_8w 32720 0.040900
10 0.997706 0.002294 velocity_24h 436 0.000545
11 0.993050 0.006950 velocity_6h 3453 0.004316
12 0.990533 0.009467 zip_count_4w 12993 0.016241
13 0.987631 0.012369 days_since_request 14229 0.017786
14 0.956782 0.043218 customer_age 6340 0.007925

Los valores outlier se pueden sustituir por la media, mediana, valores extremos (media+3std o media-3std). Tras el siguiente análisis, hemos decidido como primera iteración dejarlos sin sustituir. Una vez llegue al modelo puedo realizar iteraciones utilizando diferentes métodos para comprobar si mejora el modelo

Correlaciones¶

In [15]:
get_corr_matrix(dataset = df_fraud_train[list_var_continuous],
                metodo='pearson', size_figure=[10,8])
Out[15]:
0

Este código genera y muestra una matriz de correlación entre variables continuas en el conjunto de datos. En el gráfico resultante, la mayoría de los puntos están cercanos a 0, indicando una baja correlación entre las variables. Destaca una correlación más notable entre 'credit_risk_score' y 'proposed_credit_limit'. Esto sugiere que un puntaje de riesgo crediticio más alto tiende a asociarse con un límite de crédito propuesto más alto, lo cual tiene sentido desde una perspectiva financiera.

In [16]:
corr = df_fraud_train[list_var_continuous].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
Out[16]:
level_0 level_1 correlation
643 month velocity_4w 0.848145
334 credit_risk_score proposed_credit_limit 0.605456

El código identifica y muestra los pares de variables continuas que tienen una correlación absoluta mayor a 0.6, lo que podría indicar una relación más fuerte entre esas variables.

Tratamiento de valores nulos¶

¿Son todos los nulos de una clase de la variable objetivo? o tienen el mismo porcentaje de la variable objetivo?

In [17]:
list_var_continuous
Out[17]:
['intended_balcon_amount',
 'prev_address_months_count',
 'bank_months_count',
 'current_address_months_count',
 'session_length_in_minutes',
 'foreign_request',
 'phone_mobile_valid',
 'has_other_cards',
 'proposed_credit_limit',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'phone_home_valid',
 'credit_risk_score',
 'email_is_free',
 'income',
 'date_of_birth_distinct_emails_4w',
 'bank_branch_count_8w',
 'velocity_4w',
 'velocity_24h',
 'velocity_6h',
 'zip_count_4w',
 'days_since_request',
 'customer_age',
 'name_email_similarity',
 'month']
In [18]:
get_percent_null_values_target(df_fraud_train, list_var_continuous, target='fraud_bool')
Out[18]:
0.0 1.0 variable sum_null_values porcentaje_sum_null_values
0 0.986863 0.013137 intended_balcon_amount 593804 0.742255
1 0.985802 0.014198 prev_address_months_count 570153 0.712691
2 0.983846 0.016154 bank_months_count 202916 0.253645
3 0.996223 0.003777 current_address_months_count 3442 0.004302
4 0.989362 0.010638 session_length_in_minutes 1598 0.001998

Opción 0:¶

Algunos algoritmos aceptan en su input valores missing

Opción 1:¶

Eliminar todas las filas que tengan valores nulos. En nuestro dataset no es lo más optimo debido a que hay bastantes filas que les ocurre esta situación

Opción 2:¶

Imputar los valores missing por:

media mediana maximo minimo valores extremos https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html

Decido rellenar todas las columnas continuas menos session_length_in_minutes por el valor -99. De esta manera, diferencio los outlier del resto de la muestra poninendo un valor muy separado del resto de la variable. Se puede explorar el resultado del modelo utilizando diferentes métodos

In [19]:
list_vars = list(set(list_var_continuous)-set(['session_length_in_minutes']))
df_fraud_train[list_vars] = df_fraud_train[list_vars].fillna(-99)
df_fraud_test[list_vars] = df_fraud_test[list_vars].fillna(-99)
In [20]:
list_vars
Out[20]:
['device_distinct_emails_8w',
 'velocity_4w',
 'prev_address_months_count',
 'velocity_6h',
 'email_is_free',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'foreign_request',
 'credit_risk_score',
 'velocity_24h',
 'name_email_similarity',
 'income',
 'phone_mobile_valid',
 'proposed_credit_limit',
 'phone_home_valid',
 'current_address_months_count',
 'intended_balcon_amount',
 'bank_months_count',
 'zip_count_4w',
 'customer_age',
 'month',
 'keep_alive_session',
 'device_fraud_count',
 'has_other_cards',
 'days_since_request']
In [21]:
df_fraud_test['session_length_in_minutes'].isnull().sum() 
Out[21]:
417
In [22]:
get_percent_null_values_target(df_fraud_test, list_var_continuous, target='fraud_bool')
Out[22]:
0.0 1.0 variable sum_null_values porcentaje_sum_null_values
0 0.997602 0.002398 session_length_in_minutes 417 0.002085

Opción 3:¶

https://scikit-learn.org/stable/modules/impute.html. Utilizar un modelo de regresión para rellenar los valores mmissing de alguna variable muy importante, por ejemplo: KNN, regresion lineal, xgboost. Pero, cuidado con el sobreajuste. Vamos a usar KNNImputer para imputar los valores missing de la variable emp_length usando como regresoras todas las variables continuas

In [23]:
X_train = df_fraud_train[list(set(list_var_continuous))]
X_test = df_fraud_test[list(set(list_var_continuous))]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
model = imputer.fit(X_train)
pd_input_train = pd.DataFrame(model.transform(X_train),
                              columns=[i+'_input' for i in list(set(list_var_continuous))],index=df_fraud_train.index)
pd_input_test = pd.DataFrame(model.transform(X_test),
                             columns=[i+'_input' for i in list(set(list_var_continuous))],index=df_fraud_test.index)

df_fraud_input_train = pd.concat([df_fraud_train, pd_input_train],axis=1).drop(list(set(list_var_continuous)),axis=1)
df_fraud_input_test = pd.concat([df_fraud_test, pd_input_test],axis=1).drop(list(set(list_var_continuous)),axis=1)

El código utiliza KNN para imputar los valores faltantes en las variables continuas del conjunto de datos, creando nuevos conjuntos de datos (df_fraud_input_train y df_fraud_input_test) con los valores imputados.

In [24]:
df_fraud_input_train.shape
Out[24]:
(800000, 32)
In [25]:
get_percent_null_values_target(df_fraud_input_train, [i+'_input' for i in list_var_continuous], target='fraud_bool')
No existen variables con valores nulos
Out[25]:
In [26]:
list_var_continuous = list(df_fraud_input_train.select_dtypes('float').columns)
get_corr_matrix(dataset = df_fraud_input_train[list_var_continuous],
                metodo='pearson', size_figure=[10,8])
Out[26]:
0

Este código selecciona las variables continuas imputadas en el train, y genera una matriz de correlación para explorar las relaciones lineales entre estas varibales, como realizamos en uno de los pasos anteriores.

In [27]:
df_fraud_input_train.columns
Out[27]:
Index(['device_os', 'source', 'housing_status', 'employment_status',
       'payment_type', 'fraud_bool', 'prev_address_months_count_input',
       'bank_branch_count_8w_input', 'credit_risk_score_input',
       'velocity_24h_input', 'income_input',
       'current_address_months_count_input', 'bank_months_count_input',
       'zip_count_4w_input', 'customer_age_input', 'month_input',
       'keep_alive_session_input', 'device_fraud_count_input',
       'device_distinct_emails_8w_input', 'velocity_4w_input',
       'velocity_6h_input', 'email_is_free_input',
       'date_of_birth_distinct_emails_4w_input', 'foreign_request_input',
       'name_email_similarity_input', 'phone_mobile_valid_input',
       'proposed_credit_limit_input', 'phone_home_valid_input',
       'intended_balcon_amount_input', 'session_length_in_minutes_input',
       'has_other_cards_input', 'days_since_request_input'],
      dtype='object')

Tratamiento de las variables categoricas¶

Para la correlacion de spearman es necesario convertir las variables categoricas en numericas y luego obtener la correlación

In [28]:
list_var_cat
Out[28]:
['fraud_bool',
 'device_os',
 'source',
 'housing_status',
 'employment_status',
 'payment_type']
In [29]:
confusion_matrix = pd.crosstab(df_fraud_input_train["fraud_bool"], df_fraud_input_train["device_os"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
device_os    linux  macintosh   other  windows   x11
fraud_bool                                          
0           264784      42416  272503   205774  5700
1             1384        590    1551     5239    59
Out[29]:
0.08110361070236907

Este código nos devuelve la matriz de confusión y el valor del coeficiente Cramér's V, que indica la fuerza de la asociación entre las variables categóricas "fraud_bool" y "device_os". Un valor cercano a 0 sugiere una asociación débil, mientras que un valor cercano a 1 indica una asociación más fuerte.

In [30]:
confusion_matrix = pd.crosstab(df_fraud_input_train["fraud_bool"], df_fraud_input_train["fraud_bool"])
cramers_v(confusion_matrix.values)
Out[30]:
0.9999426978916621

En este paso, estamos comparando 'fraud_bool' consigo misma, por lo que el valor que nos da es 1 aproximado, ya qye es una asociación perfecta, auqnue no es muy informativa.

In [31]:
confusion_matrix = pd.crosstab(df_fraud_input_train["fraud_bool"], df_fraud_input_train["source"])
cramers_v(confusion_matrix.values)
Out[31]:
0.0044690710571317705

En este código comparamos 'fraud_bool' con la variable 'source'

Tratamiento de valores nulos¶

En las variables categoricas, los valores nulos se suelen sustituir por una nueva clase: "sin valor" o por la moda

In [32]:
df_fraud_input_train[list_var_cat] = df_fraud_input_train[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
df_fraud_input_test[list_var_cat] = df_fraud_input_test[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")

Guardado de la tabla¶

In [33]:
df_fraud_input_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
df_fraud_input_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")